library(ggplot2)
library(stringi)
library(gridExtra)
library(dendextend)
library(kableExtra)
library(limma)
library(psych)
library(tidyverse)
library(CONSTANd)  # install from source: https://github.com/PDiracDelta/CONSTANd/

This notebook presents isobaric labeling data analysis strategy that includes data-driven normalization.

We will check how varying analysis components [summarization/normalization/differential abundance testing methods] changes end results of a quantitative proteomic study.

source('./other_functions.R')
source('./plotting_functions.R')

# you should either make a symbolic link in this directory
data.list <- readRDS('input_data.rds')
dat.l <- data.list$dat.l # data in long format
# dat.w <- data.list$dat.w # data in wide format
if ('X' %in% colnames(dat.l)) { dat.l$X <- NULL }

# remove shared peptides
shared.peptides <- dat.l %>% filter(!shared.peptide)

# keep spectra with isolation interference <30 and no missing quantification channels
dat.l <- dat.l %>% filter(isoInterOk & noNAs)

# which proteins were spiked in?
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character

# which peptides were identified in each MS run?
unique.pep=dat.l %>% 
  group_by(Run) %>%
  distinct(Peptide) %>% 
  mutate(val=1)
unique.pep <- xtabs(val~Peptide+Run, data=unique.pep)
tmp <- apply(unique.pep, 1, function(x) all(x==1))
inner.peptides <- rownames(unique.pep)[tmp]

#TEMP -remove!
# tmp=dat.l %>% distinct(Protein) %>% pull %>% as.character
# dat.l <- dat.l %>% filter(Protein %in% c(tmp[sample(1:length(tmp), size=20)], spiked.proteins))
# specify # of varying component variants and their names
variant.names <- c('log2Intensity', 'Intensity', 'Ratio')
n.comp.variants <- length(variant.names)
scale.vec <- c('log', 'raw', 'raw')  # ratios are considered raw, because they are basically mean-normalized intensities
# pick reference condition for making plots / doing DEA
referenceCondition <- '0.5'
referenceChannels <- 'Norm'
# specify colours corresponding to biological conditions
condition.colour <- tribble(
  ~Condition, ~Colour,
  "0.125", 'black',
  "0.5", 'blue',
  "0.667", 'green',
  "1", 'red',
  "Norm", 'orange')
# create data frame with sample info (distinct Run,Channel, Sample, Condition, Colour)
sample.info <- get_sample_info(dat.l, condition.colour)
channelNames <- remove_factors(unique(sample.info$Channel))

1 Unit scale component

Which scale are the reporter ion intensities on?

dat.unit.l <- emptyList(variant.names)

1.1 log2 intensity

dat.unit.l[[1]] <- dat.l %>% mutate(response=log2(Intensity)) %>% select(-Intensity)

1.2 original intensity

dat.unit.l[[2]] <- dat.l %>% rename(response=Intensity)

1.3 intensity ratios

Calculate ratio’s (per PSM) with respect to average intensity within run, or in other words: each value is divided by the row mean.

# use half-wide data to compute within-run average of PSM channels corresponding to the reference Condition
refCols <- sample.info %>% filter(Condition==referenceCondition) %>% distinct(Channel) %>% pull
denom.df=dat.l %>% filter(Condition==referenceCondition) %>% pivot_wider(id_cols=-one_of('Condition', 'BioReplicate'),names_from='Channel', values_from='Intensity')
denom.df$denom=apply(denom.df[,refCols], 1, function(x) mean(x, na.rm=T))
denom.df=denom.df[,c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')]
dat.unit.l$Ratio <- dat.l %>% left_join(denom.df[c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')], by=c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM')) %>% mutate(response=Intensity/denom) %>% select(-c(Intensity, denom)) 
grand_median <- unlist(lapply(dat.unit.l, function(x) median(x$response, na.rm=TRUE)))

2 Normalization component: medianSweeping (1)

# switch to wide format
dat.unit.w <- lapply(dat.unit.l, function(x) {
  pivot_wider(data = x, id_cols=-one_of(c('Condition', 'BioReplicate')), names_from=Channel, values_from=response)
})
# subtract the spectrum median log2intensity from the observed log2intensities
dat.norm.w <- dat.unit.w
dat.norm.w <- lapply(dat.norm.w, function(x) {
  x[,channelNames] <- sweep(x[,channelNames], 1, apply(x[,channelNames], 1, median, na.rm=T))
  return(x) } )

3 Summarization component: Median summarization

Summarize quantification values from PSM to peptide (first step) to protein (second step).

# normalized data
dat.norm.summ.w <- lapply(dat.norm.w, function(x) {
  # group by (run,)protein,peptide then summarize twice (once on each level)
  y <- x %>% group_by(Run, Protein, Peptide) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% ungroup()
  return(y)
})

Notice that the row sums are not equal to Ncols anymore, because the median summarization does not preserve them (but mean summarization does).

Let’s also summarize the non-normalized data for comparison in the next section.

# non-normalized data
dat.nonnorm.summ.w <- lapply(dat.unit.w, function(x) {
  # group by (run,)protein,peptide then summarize twice (once on each level)
  y <- x %>% group_by(Run, Protein, Peptide) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% ungroup()
  return(y)
})

4 Normalization component: medianSweeping (2)

# medianSweeping: in each channel, subtract median computed across all proteins within the channel
# do the above separately for each MS run
dat.norm.summ.w <- lapply(dat.norm.summ.w, function(x) {
  x.split <- split(x, x$Run)
  x.split.norm  <- lapply(x.split, function(y) {
    y[,channelNames] <- sweep(y[,channelNames], 2, apply(y[,channelNames], 2, median, na.rm=T) )
    return(y) } )
  dat.norm.summ.w <- bind_rows(x.split.norm)
} )

5 QC plots

# make data completely wide (also across runs)

## non-normalized data
dat.nonnorm.summ.w2 <- lapply(dat.nonnorm.summ.w, function(x) {
  return( x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}") )
})

## normalized data
dat.norm.summ.w2 <- lapply(dat.norm.summ.w, function(x) {
  return( x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}") )
})

5.1 Boxplots

# use (half-)wide format
for (i in 1: n.comp.variants){
  par(mfrow=c(1,2))
    boxplot_w(dat.nonnorm.summ.w[[i]],sample.info, paste('Raw', variant.names[i], sep='_'))
    boxplot_w(dat.norm.summ.w[[i]], sample.info, paste('Normalized', variant.names[i], sep='_'))
  par(mfrow=c(1,1))
}

5.2 MA plots

MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).

# different unit variants require different computation of fold changes and average abuandance: additive or multiplicative scale; see maplot_ils function
# use wide2 format
for (i in 1: n.comp.variants){
  p1 <- maplot_ils(dat.nonnorm.summ.w2[[i]], 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[i], paste('Raw', variant.names[i], sep='_'), spiked.proteins)
  p2 <- maplot_ils(dat.norm.summ.w2[[i]], 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[i], paste('Normalized', variant.names[i], sep='_'), spiked.proteins)
  grid.arrange(p1, p2, ncol=2)
}

5.3 MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).

# different unit variants require different computation of fold changes and average abuandance: additive or multiplicative scale; see maplot_ils function
channels.num <- sample.info %>% filter(Condition=='1') %>% select(Sample) %>% pull
channels.denom <- sample.info %>% filter(Condition=='0.125') %>% select(Sample) %>% pull

for (i in 1: n.comp.variants){
  p1 <- maplot_ils(dat.nonnorm.summ.w2[[i]], channels.num, channels.denom, scale=scale.vec[i], paste('Raw', variant.names[i], sep='_'), spiked.proteins)
  p2 <- maplot_ils(dat.norm.summ.w2[[i]], channels.num, channels.denom, scale=scale.vec[i], paste('Normalized', variant.names[i], sep='_'), spiked.proteins)
  grid.arrange(p1, p2, ncol=2)
}

5.4 CV (coefficient of variation) plots

dat.nonnorm.summ.l <- lapply(dat.nonnorm.summ.w, function(x){
  x$Mixture <- unlist(lapply(stri_split(x$Run,fixed='_'), function(y) y[1]))
  x <- to_long_format(x, sample.info)
})

dat.norm.summ.l <- lapply(dat.norm.summ.w, function(x){
  x$Mixture <- unlist(lapply(stri_split(x$Run,fixed='_'), function(y) y[1]))
  x <- to_long_format(x, sample.info)
})

par(mfrow=c(1, 2))
for (i in 1: n.comp.variants){
    cvplot_ils(dat=dat.nonnorm.summ.l[[i]], feature.group='Protein', xaxis.group='Condition',
               title=paste('Raw', variant.names[i], sep='_'))
    cvplot_ils(dat=dat.norm.summ.l[[i]], feature.group='Protein', xaxis.group='Condition',
               title=paste('Normalized', variant.names[i], sep='_'), add.constant=grand_median[i])
}

par(mfrow=c(1, 1))

5.5 PCA plots

5.5.1 Using all proteins

par(mfrow=c(1, 2))
for (i in seq_along(dat.norm.summ.w2)){
  pcaplot_ils(dat.nonnorm.summ.w2[[i]] %>% select(-'Protein'), info=sample.info, paste('Raw', variant.names[i], sep='_'))
  pcaplot_ils(dat.norm.summ.w2[[i]] %>% select(-'Protein'), info=sample.info, paste('Normalized', variant.names[i], sep='_'))
}

par(mfrow=c(1, 1))

5.5.2 Using spiked proteins only

for (i in seq_along(dat.norm.summ.w2)){
  par(mfrow=c(1, 2))
    pcaplot_ils(dat.nonnorm.summ.w2[[i]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('Raw', variant.names[i], sep='_'))
    pcaplot_ils(dat.norm.summ.w2[[i]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('Normalized', variant.names[i], sep='_'))
  par(mfrow=c(1, 1))
}

5.6 HC (hierarchical clustering) plots

5.6.1 Using all proteins

for (i in seq_along(dat.norm.summ.w2)){
  par(mfrow=c(1, 2))
    dendrogram_ils(dat.nonnorm.summ.w2[[i]] %>% select(-Protein), info=sample.info, paste('Raw', variant.names[i], sep='_'))
    dendrogram_ils(dat.norm.summ.w2[[i]] %>% select(-Protein), info=sample.info, paste('Normalized', variant.names[i], sep='_'))
  par(mfrow=c(1, 1))
}

6 DEA component: Moderated t-test

# drop referenChannels
chr <- sample.info %>% filter(Condition==referenceChannels) %>% distinct(Run:Channel) %>% pull %>% as.character
dat.norm.summ.w2 <- lapply(dat.norm.summ.w2, function(x) x %>% select(-one_of(chr)))
sample.info <- sample.info %>% filter(!(Condition %in% referenceChannels)) %>% droplevels

NOTE: - actually, lmFit (used in moderated_ttest) was built for log2-transformed data. However, supplying untransformed intensities can also work. This just means that the effects in the linear model are also additive on the untransformed scale, whereas for log-transformed data they are multiplicative on the untransformed scale. Also, there may be a bias which occurs from biased estimates of the population means in the t-tests, as mean(X) is not equal to exp(mean(log(X))).

#{ INVESTIGATE late log2 transform
dat.norm.summ.w2$Intensity_lateLog2 <- dat.norm.summ.w2$Intensity
channelNames.prefixed <- colnames(dat.norm.summ.w2$Intensity %>% select(-Protein))
dat.norm.summ.w2$Intensity_lateLog2[,channelNames.prefixed] <- log2(dat.norm.summ.w2$Intensity[,channelNames.prefixed])
variant.names <- names(dat.norm.summ.w2)
scale.vec <- c(scale.vec, 'log')
n.comp.variants <- n.comp.variants + 1
#}
design.matrix <- get_design_matrix(referenceCondition, sample.info)
dat.dea <- emptyList(names(dat.norm.summ.w2))
for (i in seq_along(dat.norm.summ.w2)) {
  this_scale <- scale.vec[match(names(dat.dea)[i], variant.names)]
  d <- column_to_rownames(as.data.frame(dat.norm.summ.w2[[i]]), 'Protein')
  dat.dea[[i]] <- moderated_ttest(dat=d, design.matrix, scale=this_scale)
}

7 Results comparison

7.1 Confusion matrix

cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm, referenceCondition)
0.125 vs 0.5 contrast
log2Intensity
Intensity
Ratio
Intensity_lateLog2
background spiked background spiked background spiked background spiked
not_DE 4056 4 4057 13 4055 7 3862 4
DE 3 15 2 6 4 12 2 0
0.125 vs 0.5 contrast
log2Intensity Intensity Ratio Intensity_lateLog2
Accuracy 0.998 0.996 0.997 0.998
Sensitivity 0.789 0.316 0.632 0.000
Specificity 0.999 1.000 0.999 0.999
PPV 0.833 0.750 0.750 0.000
NPV 0.999 0.997 0.998 0.999
0.667 vs 0.5 contrast
log2Intensity
Intensity
Ratio
Intensity_lateLog2
background spiked background spiked background spiked background spiked
not_DE 4059 17 4059 18 4058 16 3773 19
DE 0 2 0 1 1 3 3 0
0.667 vs 0.5 contrast
log2Intensity Intensity Ratio Intensity_lateLog2
Accuracy 0.996 0.996 0.996 0.994
Sensitivity 0.105 0.053 0.158 0.000
Specificity 1.000 1.000 1.000 0.999
PPV 1.000 1.000 0.750 0.000
NPV 0.996 0.996 0.996 0.995
1 vs 0.5 contrast
log2Intensity
Intensity
Ratio
Intensity_lateLog2
background spiked background spiked background spiked background spiked
not_DE 4054 5 4058 10 4054 4 3561 11
DE 5 14 1 9 5 15 0 1
1 vs 0.5 contrast
log2Intensity Intensity Ratio Intensity_lateLog2
Accuracy 0.998 0.997 0.998 0.997
Sensitivity 0.737 0.474 0.789 0.083
Specificity 0.999 1.000 0.999 1.000
PPV 0.737 0.900 0.750 1.000
NPV 0.999 0.998 0.999 0.997

7.2 Scatter plots

# character vectors containing logFC and p-values columns
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
significance.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)

scatterplot_ils(dat.dea, significance.cols, 'q-values', spiked.proteins)

scatterplot_ils(dat.dea, logFC.cols, 'log2FC', spiked.proteins)

7.3 Volcano plots

for (i in 1:n.contrasts){
  volcanoplot_ils(dat.dea, i, spiked.proteins)
}

7.4 Violin plots

Let’s see whether the spiked protein fold changes make sense

# plot theoretical value (horizontal lines) and violin per variant
violinplot_ils(lapply(dat.dea, function(x) x[spiked.proteins, logFC.cols]), referenceCondition)

8 Conclusions

9 Session information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252  LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] CONSTANd_0.99.0   forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2       purrr_0.3.4      
##  [6] readr_1.4.0       tidyr_1.1.2       tibble_3.0.4      tidyverse_1.3.0   psych_2.0.9      
## [11] limma_3.44.3      kableExtra_1.3.1  dendextend_1.14.0 gridExtra_2.3     stringi_1.5.3    
## [16] ggplot2_3.3.2     rmarkdown_2.5    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-149         fs_1.5.0             lubridate_1.7.9      webshot_0.5.2        httr_1.4.2          
##  [6] JGmisc_0.3           tools_4.0.3          backports_1.1.10     R6_2.5.0             rpart_4.1-15        
## [11] DBI_1.1.0            mgcv_1.8-33          colorspace_1.4-1     nnet_7.3-14          withr_2.3.0         
## [16] tidyselect_1.1.0     mnormt_2.0.2         compiler_4.0.3       cli_2.1.0            rvest_0.3.6         
## [21] xml2_1.3.2           labeling_0.4.2       scales_1.1.1         digest_0.6.26        pkgconfig_2.0.3     
## [26] htmltools_0.5.0      highr_0.8            dbplyr_2.0.0         rlang_0.4.8          readxl_1.3.1        
## [31] rstudioapi_0.12      farver_2.0.3         generics_0.1.0       jsonlite_1.7.1       ModelMetrics_1.2.2.2
## [36] magrittr_1.5         Matrix_1.2-18        Rcpp_1.0.5           munsell_0.5.0        fansi_0.4.1         
## [41] viridis_0.5.1        lifecycle_0.2.0      pROC_1.16.2          yaml_2.2.1           MASS_7.3-53         
## [46] plyr_1.8.6           recipes_0.1.15       grid_4.0.3           parallel_4.0.3       crayon_1.3.4        
## [51] lattice_0.20-41      haven_2.3.1          splines_4.0.3        hms_0.5.3            tmvnsim_1.0-2       
## [56] knitr_1.30           pillar_1.4.6         reshape2_1.4.4       codetools_0.2-16     stats4_4.0.3        
## [61] reprex_0.3.0         glue_1.4.2           evaluate_0.14        data.table_1.13.2    modelr_0.1.8        
## [66] vctrs_0.3.4          foreach_1.5.1        cellranger_1.1.0     gtable_0.3.0         assertthat_0.2.1    
## [71] xfun_0.18            gower_0.2.2          prodlim_2019.11.13   broom_0.7.2          e1071_1.7-4         
## [76] class_7.3-17         survival_3.2-7       viridisLite_0.3.0    timeDate_3043.102    iterators_1.0.13    
## [81] lava_1.6.8.1         ellipsis_0.3.1       caret_6.0-86         ipred_0.9-9